library(tidyverse)
Take A Sad Plot & Make It Better
https://apreshill.github.io/ohsu-biodatavis/slides.html#1
Quick example:
set.seed(1000)
asdpop <- tibble::tibble(
time1 = sample(1:100, 100, replace = F),
time2 = time1) %>%
tidyr::gather(x, y, time1:time2, factor_key = TRUE)
asdpop
Some edits:
asdpop <- asdpop %>%
mutate(services = as.factor(case_when(
x == "time1" & y <= 30 ~ 1,
x == "time1" & y > 30 ~ 0,
x == "time2" & y <= 60 ~ 1,
TRUE ~ 0
)))
asdpop
bar1 <- ggplot(asdpop, aes(x, fill = services))
bar1 <- bar1 + geom_bar(width = .6)
bar1
Simple, right?
library(wesanderson)
ff <- wes_palette("FantasticFox1")[c(2:3)]
bar2 <- bar1 + scale_fill_manual(values = ff)
bar2
bar3 <- bar2 + scale_x_discrete(name = "", labels = c("Time 1", "Time 2"))
bar3 <- bar3 + scale_y_continuous(expand = c(.02, 0),
name = "ASD Cases per 10,000")
bar3 <- bar3 + theme_bw()
bar3 <- bar3 + theme(axis.title = element_text(size = 10))
bar3 <- bar3 + theme(legend.text = element_text(size = 10))
bar3 <- bar3 + theme(legend.title = element_text(size = 10))
bar3 <- bar3 + theme(axis.ticks = element_blank())
bar3 <- bar3 + theme(panel.border = element_blank())
bar3 <- bar3 + theme(axis.line = element_blank())
bar3 <- bar3 + theme(panel.grid = element_blank())
bar3
bar4 <- bar3 + annotate("text", label = "Accessing \nServices",
x = 2, y = 30, size = 4, color = "white",
fontface = "bold")
bar4 <- bar4 + annotate("text", label = "Not \nAccessing \nServices",
x = 2, y = 80, size = 4, color = "white",
fontface = "bold")
bar4 <- bar4 + guides(fill = FALSE)
bar4
# add the top horizontal line for population prevalence
bar5 <- bar4 + geom_segment(aes(x = .6, xend = 2.45, y = 100, yend = 100),
lty = 3, lwd = .3, colour = "black")
bar5
bar6 <- bar5 + coord_cartesian(ylim = c(0, 102), xlim = c(1, 3.2))
bar6 <- bar6 + annotate("text",
x = 2.5, y = 97, size = 4, hjust = 0,
label = "Estimates of prevalence based\non population sampling will remain\nstable over time if true prevalence\nis stable.")
bar6
# add segments to track sample prevalence
bar7 <- bar6 + geom_segment(aes(x = .6, xend = 1.3, y = 30, yend = 30),
lty = 3, lwd = .5, colour = ff[2])
bar7 <- bar7 + geom_segment(aes(x = 1.3, xend = 1.7, y = 30, yend = 60),
lty = 3, lwd = .5, colour = ff[2])
bar7 <- bar7 + geom_segment(aes(x = 1.7, xend = 2.45, y = 60, yend = 60),
lty = 3, lwd = .5, colour = ff[2])
bar7
How were those specific numbers obtained?
asdpop %>%
group_by(services) %>%
summarize(minimum = min(y),
maximum = max(y))
bar8 <- bar7 + annotate("text",
x = 2.5, y = 60, size = 4, hjust = 0,
label = "Estimates of prevalence based\non individuals accessing services\ncan create an illusion of an\nincrease in prevalence over time,\nyet still underestimate prevalence\nat both time points.")
bar8
set.seed(2018)
dot <- ggplot(asdpop, aes(x))
dot <- dot + geom_jitter(aes(y = y, colour = services),
position = position_jitter(width = .25,
height = 0),
alpha = .75, size = 2)
dot <- dot + scale_x_discrete(name = "", labels = c("Time 1", "Time 2"))
dot <- dot + scale_y_continuous(name = "ASD Cases per 10,000")
dot <- dot + guides(colour = guide_legend(keyheight = 1.5))
dot
dotseg <- dot + scale_colour_manual(values = ff,
name = "",
labels = c("Not accessing \nservices",
"Accessing \nservices"))
dotseg <- dotseg + annotate("text",
x = 1.2, y = 102, size = 4, hjust = 0,
label = "True ASD Prevalence")
dotseg <- dotseg + geom_segment(aes(x = .6, xend = 2.4, y = 100, yend = 100),
lty = 3, lwd = .5, colour = "black")
dotseg <- dotseg + geom_segment(aes(x = .6, xend = 1.3, y = 30, yend = 30),
lty = 3, lwd = .5, colour = ff[2])
dotseg <- dotseg + geom_segment(aes(x = 1.3, xend = 1.7, y = 30, yend = 60),
lty = 3, lwd = .5, colour = ff[2])
dotseg <- dotseg + geom_segment(aes(x = 1.7, xend = 2.4, y = 60, yend = 60),
lty = 3, lwd = .5, colour = ff[2])
dotseg <- dotseg + theme(axis.ticks = element_blank())
dotseg
set.seed(2018)
dotcol <- ggplot(asdpop, aes(x))
dotcol <- dotcol + geom_bar(fill = "white", width = .6)
dotcol <- dotcol + geom_jitter(aes(y = y, colour = services),
position = position_jitter(width = .25,
height = 0),
alpha = .75, size = 2)
dotcol <- dotcol + scale_x_discrete(name = "", labels = c("Time 1", "Time 2"))
dotcol <- dotcol + scale_y_continuous(name = "ASD Cases per 10,000")
dotcol <- dotcol + scale_colour_manual(values = ff,
name = "",
labels = c("Not accessing \nservices",
"Accessing \nservices"))
dotcol <- dotcol + guides(colour = guide_legend(keyheight = 1.5))
dotcol <- dotcol + annotate("text",
x = 1.2, y = 102, size = 4, hjust = 0,
label = "True ASD Prevalence")
dotcol <- dotcol + geom_segment(aes(x = .6, xend = 2.4, y = 100, yend = 100),
lty = 3, lwd = .5, colour = "black")
dotcol <- dotcol + geom_segment(aes(x = .6, xend = 1.3, y = 30, yend = 30),
lty = 3, lwd = .5, colour = ff[2])
dotcol <- dotcol + geom_segment(aes(x = 1.3, xend = 1.7, y = 30, yend = 60),
lty = 3, lwd = .5, colour = ff[2])
dotcol <- dotcol + geom_segment(aes(x = 1.7, xend = 2.4, y = 60, yend = 60),
lty = 3, lwd = .5, colour = ff[2])
dotcol <- dotcol + theme(axis.ticks = element_blank())
dotcol <- dotcol + theme(legend.key=element_blank())
dotcol
set.seed(2018)
dotbw <- ggplot(asdpop, aes(x, y))
dotbw <- dotbw + geom_jitter(aes(colour = services),
position = position_jitter(width = .25,
height = 0),
alpha = .75, size = 2)
dotbw <- dotbw + scale_x_discrete(name = "", labels = c("Time 1", "Time 2"))
dotbw <- dotbw + scale_y_continuous(expand = c(.02, 0),
name = "ASD Cases per 10,000")
dotbw <- dotbw + scale_colour_manual(values = ff,
name = "",
labels = c("Not accessing \nservices",
"Accessing \nservices"))
dotbw <- dotbw + guides(colour = guide_legend(keyheight = 1.5))
dotbw <- dotbw + theme_bw()
dotbw <- dotbw + theme(axis.ticks = element_blank())
dotbw <- dotbw + theme(panel.border = element_blank())
dotbw <- dotbw + theme(panel.grid = element_blank())
dotbw <- dotbw + theme(axis.title.y = element_text(size = 10))
dotbw <- dotbw + theme(axis.text = element_text(size = 10))
dotbw <- dotbw + theme(axis.line = element_line(colour = "gray80"))
dotbw
Time 1 and Time 2 characterized?set.seed(2018)
dotleg <- ggplot(asdpop, aes(x, y))
dotleg <- dotleg + geom_jitter(aes(colour = services),
position = position_jitter(width = .25,
height = 0),
alpha = .75, size = 2)
dotleg <- dotleg + scale_x_discrete(expand = c(0, 0.6),
name = "",
labels = c("Time 1:\nPoor Service Access", "Time 2:\nBetter Service Access"))
dotleg <- dotleg + scale_y_continuous(expand = c(.02, 0),
name = "ASD Cases per 10,000",
breaks = seq(0, 100, by = 20))
dotleg <- dotleg + theme_bw()
dotleg <- dotleg + theme(axis.ticks = element_blank())
dotleg <- dotleg + theme(panel.border = element_blank())
dotleg <- dotleg + theme(panel.grid = element_blank())
dotleg <- dotleg + theme(axis.title.y = element_text(size = 10))
dotleg <- dotleg + theme(axis.text = element_text(size = 10))
dotleg <- dotleg + coord_cartesian(ylim = c(0, 102), xlim = c(1, 3.2))
dotleg <- dotleg + scale_colour_manual(name = "ASD cases who are:",
values = ff,
labels = c("Not accessing services",
"Accessing services"))
dotleg <- dotleg + guides(colour = guide_legend(keywidth = 1.1,
keyheight = 1.1,
override.aes = list(alpha = 1, size = 3)))
dotleg <- dotleg + theme(legend.position=c(.75, .25))
dotleg <- dotleg + theme(legend.text = element_text(size = 10))
dotleg <- dotleg + theme(legend.title = element_text(size = 10))
dotleg <- dotleg + theme(legend.background = element_rect(fill = "gray90",
size=.3,
linetype="dotted"))
dotleg
# lines
dotline <- dotleg + geom_segment(aes(x = .6, xend = 2.4, y = 100, yend = 100),
lty = 3, lwd = .5, colour = "black")
dotline <- dotline + geom_segment(aes(x = .6, xend = 1.3, y = 30, yend = 30),
lty = 3, lwd = .5, colour = ff[2])
dotline <- dotline + geom_segment(aes(x = 1.3, xend = 1.7, y = 30, yend = 60),
lty = 3, lwd = .5, colour = ff[2])
dotline <- dotline + geom_segment(aes(x = 1.7, xend = 2.4, y = 60, yend = 60),
lty = 3, lwd = .5, colour = ff[2])
dotline
dotann <- dotline + annotate("text",
x = 2.5, y = 97, size = 4, hjust = 0,
label = "Estimates of prevalence based\non population sampling will remain\nstable over time if true prevalence\nis stable.")
dotann <- dotann + annotate("text",
x = 2.5, y = 60, size = 4, hjust = 0,
label = "Estimates of prevalence based\non individuals accessing services\ncan create an illusion of an\nincrease in prevalence over time,\nyet still underestimate prevalence\nat both time points.")
dotann
set.seed(2018)
dotprint <- ggplot(asdpop, aes(x, y))
dotprint <- dotprint + geom_jitter(aes(fill = services),
position = position_jitter(width=.25,
height = 0),
pch = 21,
colour = "black",
size = 2)
dotprint <- dotprint + scale_x_discrete(expand = c(0, 0.6),
name = "",
labels = c("Time 1:\nPoor Service Access", "Time 2:\nBetter Service Access"))
dotprint <- dotprint + scale_y_continuous(expand = c(.02, 0),
name = "ASD Cases per 10,000",
breaks = seq(0, 100, by = 20))
dotprint <- dotprint + theme_bw()
dotprint <- dotprint + theme(axis.ticks = element_blank())
dotprint <- dotprint + theme(panel.border = element_blank())
dotprint <- dotprint + theme(panel.grid = element_blank())
dotprint <- dotprint + theme(axis.title.y = element_text(size = 10))
dotprint <- dotprint + theme(axis.text = element_text(size = 10))
dotprint <- dotprint + coord_cartesian(ylim = c(0, 102), xlim = c(1, 3.2))
dotprint <- dotprint + scale_fill_manual(name = "ASD cases who are:",
values = c("black", "white"),
labels = c("Not accessing services",
"Accessing services"))
dotprint <- dotprint + guides(colour = guide_legend(keywidth = 1.1,
keyheight = 1.1,
override.aes = list(alpha = 1, size = 3)))
dotprint <- dotprint + theme(legend.position=c(.75, .25))
dotprint <- dotprint + theme(legend.text = element_text(size = 10))
dotprint <- dotprint + theme(legend.title = element_text(size = 10))
dotprint <- dotprint + theme(legend.background = element_rect(fill = "gray90",
size=.3,
linetype="dotted"))
# lines
dotprint <- dotprint + geom_segment(aes(x = .6, xend = 2.4, y = 100, yend = 100),
lty = 3, lwd = .5, colour = "black")
dotprint <- dotprint + geom_segment(aes(x = .6, xend = 1.3, y = 30, yend = 30),
lty = 3, lwd = .5, colour = "black")
dotprint <- dotprint + geom_segment(aes(x = 1.3, xend = 1.7, y = 30, yend = 60),
lty = 3, lwd = .5, colour = "black")
dotprint <- dotprint + geom_segment(aes(x = 1.7, xend = 2.4, y = 60, yend = 60),
lty = 3, lwd = .5, colour = "black")
dotprint <- dotprint + annotate("text",
x = 2.5, y = 97, size = 4, hjust = 0,
label = "Estimates of prevalence based\non population sampling will remain\nstable over time if true prevalence\nis stable.")
dotprint <- dotprint + annotate("text",
x = 2.5, y = 60, size = 4, hjust = 0,
label = "Estimates of prevalence based\non individuals accessing services\ncan create an illusion of an\nincrease in prevalence over time,\nyet still underestimate prevalence\nat both time points.")
dotprint
Read data:
# migration <- read_rds("migration.rds")
migration <- read_rds("https://github.com/reisanar/datasets/blob/master/migration.rds?raw=true")
cannot open compressed file 'https://github.com/reisanar/datasets/blob/master/migration.rds?raw=true', probable reason 'No such file or directory'Error in gzfile(file, "rb") : cannot open the connection
myurl <- "https://github.com/reisanar/datasets/blob/master/migration.rds?raw=true"
temp <- tempfile() # create a tempfile
download.file(myurl, temp) # download to disk
trying URL 'https://github.com/reisanar/datasets/blob/master/migration.rds?raw=true'
Content type 'application/octet-stream' length 6185196 bytes (5.9 MB)
==================================================
downloaded 5.9 MB
migration <- readRDS(temp) # read the tempfile
unlink(temp) # Deletes tempfile
Check
head(migration)
colnames(migration) %>% head()
[1] "gender" "year" "code" "country_dest" "afghanistan" "albania"
colnames(migration) %>% tail()
[1] "viet_nam" "wallis_and_futuna_islands" "western_sahara"
[4] "yemen" "zambia" "zimbabwe"
G7 countries:
g7 <- c("canada", "france", "germany", "italy", "japan", "united_kingdom", "united_states_of_america")
migration_2019 <- migration %>%
pivot_longer(cols = -c(1:4), names_to = "country_orig", values_to = "n_migrants") %>%
filter(year == 2019) %>%
filter_at(c("country_orig", "country_dest"), ~.x %in% g7) %>%
mutate_at(c("country_orig", "country_dest"), ~case_when(.x == "united_kingdom" ~ "uk", .x == "united_states_of_america" ~ "usa", TRUE ~ .x)) %>%
group_by(country_orig, country_dest) %>%
summarise(n_migrants = sum(n_migrants))
migration_2019 %>% head(4)
library(ggalluvial)
library(scales)
Sankey Diagram (Alluvial Plot)
ggplot(migration_2019,
aes(y = n_migrants, axis1 = country_orig, axis2 = country_dest)) +
geom_alluvium(aes(fill = country_orig)) +
geom_stratum(width = 1/12, fill = "black", color = "grey") +
geom_label(stat = "stratum", infer.label = TRUE) +
scale_x_discrete(limits = c("Origin", "Destination"), expand = c(.05, .05)) +
scale_fill_brewer(type = "qual", palette = "Set1") +
guides(fill = FALSE) +
labs(y = "", title = "G7 Cross-Country Migration 2019")
head(migration)
See more examples here: https://cran.r-project.org/web/packages/ggalluvial/vignettes/ggalluvial.html
Here’s a good example on the type of problems/results for which this type of plot can be very useful: https://towardsdatascience.com/alluvial-diagrams-783bbbbe0195
This blogpost also does a good job at motivating the use of alluvial diagrams: https://www.thinkingondata.com/alluvial-diagram/
data(majors)
Check
majors
Adjust data type:
majors$curriculum <- factor(majors$curriculum)
Quick plot:
ggplot(data = majors,
aes(x = semester, stratum = curriculum, alluvium = student,
fill = curriculum, label = curriculum)) +
geom_stratum() +
geom_flow() +
theme(legend.position = "bottom") +
labs(title = "Student curricula across several semesters")
The stratum heights y are unspecified, so each row is given unit height. This example demonstrates one way ggalluvial handles missing data. The alternative is to set the parameter na.rm to TRUE. Missing data handling (specifically, the order of the strata) also depends on whether the stratum variable is character or factor/numeric.
ggplot(data = majors,
aes(x = semester, stratum = curriculum, alluvium = student,
fill = curriculum, label = curriculum)) +
geom_stratum(na.rm = TRUE) +
geom_flow(na.rm = TRUE) +
theme(legend.position = "bottom") +
labs(title = "Student curricula across several semesters")